# Widen the notebook container for easier viewing.
# NOTE: `IPython.core.display` has been deprecated as an import path since
# IPython 7.14; `IPython.display` is the supported public location.
from IPython.display import display, HTML

display(HTML("<style>.container { width:60% !important; }</style>"))
Notebook Model_training: the fifth and final notebook of Robert Sietsma's Master's thesis. It contains the analysis of all models within the clinical gene panels; an index is displayed below.
# Importing some shizzle.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import json
import pickle
from utilities import get_header, genepanel_analysis, auc_analysis_function
from utilities import analyze_auc_per_gene, correct_threshold, read_capice_output
from utilities import full_auc_analysis
from sklearn.metrics import recall_score, precision_score, f1_score
from pathlib import Path
import subprocess
from sklearn.metrics import roc_auc_score
from bokeh.palettes import viridis
import time
import math
from scipy.optimize import curve_fit
# Defining some import and export locations.
location = 'rjsietsma'
read_loc = '/home/'+location+'/shared/'
# Fixed typo in the variable name (was `data_expor_loc`); the old name is
# kept as an alias in case other notebook cells still reference it.
data_export_loc = '/home/'+location+'/Documents/School/Master_DSLS/Final_Thesis/Past_initial_data/'
data_expor_loc = data_export_loc
img_output_dir = '/home/'+location+'/PycharmProjects/dsls_master_thesis/side_scripts/output_img/'

# Load the UMCG gene panel definitions; the 5GPM panel is out of scope and
# is dropped (pop with default so a missing key is not an error).
with open('./umcg_genepanels.json', 'r') as panels:
    genepanels = json.load(panels)
genepanels.pop('5GPM', None)

# Read the gzipped training dataset. The header line starts with '#Chrom',
# so it is extracted separately and the '#' lines are skipped as comments.
file_loc = './datafiles/train.txt.gz'
header = get_header(file_loc, '#Chrom')
train = pd.read_csv(file_loc, compression='gzip', names=header, comment='#', sep='\t', low_memory=False)
train
# Build the unique gene list of every dyslipidaemia sub-panel within the
# cardiovascular ('Hart- en vaatziekten') panel, then restrict the training
# data to those genes and inspect the class balance.
with open('./umcg_genepanels.json', 'r') as json_file:
    genes = json.load(json_file)
dislipid_genes = genes['Hart- en vaatziekten']
genelist = []
for disease, panel_genes in dislipid_genes.items():
    # Only sub-panels whose name starts with 'dyslipid' are of interest.
    if not disease.lower().startswith('dyslipid'):
        continue
    for gene in panel_genes:
        if gene not in genelist:
            genelist.append(gene)
dislipid_subset = train.loc[train['GeneName'].isin(genelist)]
dislipid_subset
dislipid_subset['label'].value_counts()
# model_gcc = pickle.load(open('./test_output/model_2_0/default_hyper/base/xgb_gcc_cluster.pickle.dat', 'rb'))
# model_gcc.attributes()
# model_github = pickle.load(open('./test_output/model_2_0/default_hyper/base/xgb_github.pickle.dat', 'rb'))
# model_github.attributes()

# Compare the GCC-cluster model output with the GitHub model output:
# concatenate both result sets and drop duplicates — if the row count drops
# back to the original size, both models produced identical predictions.
test_gcc = read_capice_output('./test_output/test_gccmodel_0721.txt')
test_git = read_capice_output('./test_output/test_githubmodel_0721.txt')
print(test_gcc.shape[0])
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement with the same default semantics.
test_gcc = pd.concat([test_gcc, test_git])
print(test_gcc.shape[0])
test_gcc.drop_duplicates(inplace=True)
print(test_gcc.shape[0])
# Baseline model 2.0 results (XGBoost 1.1.1, python 3.8), scored with the
# default CAPICE threshold of 0.02.
train_base20 = read_capice_output('./test_output/model_2_0/result_files/train_base2_0.txt')
train_base20
# correct_threshold(train_base20)
test_base20 = read_capice_output('./test_output/model_2_0/result_files/test_base2_0.txt')
print("XGBoost 1.1.1, python3.8, threshold 0.02:")
auc_analysis_function(train_base20, test_base20)
# Same model, but scored with threshold 0.149 instead of 0.02
# (presumably the output of correct_threshold() above — verify).
train_base20ct = read_capice_output('./test_output/model_2_0/result_files/train_base_xgboost_correctthres.txt')
test_base20ct = read_capice_output('./test_output/model_2_0/result_files/test_base_xgboost_correctthres.txt')
print("XGboost 1.1.1, python3.8, using threshold 0.149:")
auc_analysis_function(train_base20ct, test_base20ct)
Why is the base 2.0 model not showing a threshold close to 0.02?
# Model 1.0 baseline (XGBoost 0.72.1) training results.
train_base = read_capice_output('./test_output/model_2_0/result_files/train_base_xgboost_0721.txt')
train_base
# train_base_recall, train_base_threshold = correct_threshold(train_base)
# Re-attach the true labels to the CAPICE output by merging on variant
# identity (chrom / pos / ref / alt).
train_in = pd.read_csv(
    './datafiles/train.txt.gz',
    compression='gzip', sep='\t', low_memory=False)
data = train_base.merge(
    train_in[['#Chrom', 'Pos', 'Ref', 'Alt', 'label']],
    left_on=['chr', 'pos', 'ref', 'alt'],
    right_on=['#Chrom', 'Pos', 'Ref', 'Alt'])
# Drop the duplicated right-hand merge keys plus any pandas-suffixed
# (_x/_y) column-name collisions produced by the merge.
drop_labels = ['#Chrom', 'Pos', 'Ref', 'Alt']
for x in data.columns:
    if x.endswith('_x') or x.endswith('_y'):
        drop_labels.append(x)
data.drop(columns=drop_labels, inplace=True)
data
def default_threshold(row, threshold=0.02):
    """Binarize a CAPICE probability score.

    Args:
        row: predicted probability for a single variant.
        threshold: decision boundary; defaults to 0.02, the CAPICE
            default, but is now a parameter so other thresholds
            (e.g. 0.149/0.152 used elsewhere in this notebook) can reuse
            this function.

    Returns:
        int: 1 when the probability exceeds ``threshold``, else 0.
    """
    return int(row > threshold)
# Binarize predictions and labels, then report recall / precision / F1 for
# python3.8 with the default 0.02 threshold.
# The lambda wrapper was redundant: apply() can take the function directly.
data['pred'] = data['probabilities'].apply(default_threshold)
data['label'].replace({'Pathogenic': 1, 'Benign': 0}, inplace=True)
y_true = np.array(data['label'])
y_pred = np.array(data['pred'])
print(f"The recall score of using Python3.8 and threshold 0.02 is: {recall_score(y_true, y_pred)}")
print(f"The precision score of using Python3.8 and threshold 0.02 is: {precision_score(y_true, y_pred)}")
print(f"Resulting in a F1 score of: {f1_score(y_true, y_pred)}")
# Model 1.0 (XGBoost 0.72.1, python 3.8) test results, default threshold.
test_base = read_capice_output('./test_output/model_2_0/result_files/test_base_xgboost_0721.txt')
test_base
print('Using XGboost 0.72.1 on python3.8, default threshold (0.02):')
auc_analysis_function(train_base, test_base)
# Same model and runtime, but with the threshold corrected to 0.152.
train_basect = read_capice_output('./test_output/model_2_0/result_files/train_base_xgboost_0721_correctthres.txt')
test_basect = read_capice_output('./test_output/model_2_0/result_files/test_base_xgboost_0721_correctthres.txt')
print("Using XGboost 0.72.1 on python3.8, threshold set to 0.152:")
auc_analysis_function(train_basect, test_basect)
# Inspect the model-2.0 default-hyperparameter booster (XGBoost 1.1.1).
# NOTE: pickle is only safe here because these are locally generated,
# trusted artifacts. A context manager replaces the bare pickle.load(open())
# so the file handle is closed deterministically instead of leaking.
with open('./test_output/model_2_0/default_hyper/base/unbalanced/xgb_defaulthyper_111.pickle.dat', 'rb') as model_fh:
    default_hyper_version111_model = pickle.load(model_fh)
default_hyper_version111_model
train_forsure_default = read_capice_output('./test_output/model_2_0/default_hyper/result_files/train_defaulthyper_nothres.txt')
test_forsure_default = read_capice_output('./test_output/model_2_0/default_hyper/result_files/test_defaulthyper_nothres.txt')
print('Model 2.0, default hyperparameters, xgboost 1.1.1, python3.7, threshold 0.02:')
auc_analysis_function(train_forsure_default, test_forsure_default)
# correct_threshold(train_results=train_forsure_default, include_upper=True)
test_forsure_default
# Cross-check: model 1.0 rebuilt with a forced XGBoost 0.72.1 install.
train_forsure_oldmodel = read_capice_output('./test_output/model_2_0/default_hyper/result_files/train_xgboost0721_defaulthyper.txt')
test_forsure_oldmodel = read_capice_output('./test_output/model_2_0/default_hyper/result_files/test_xgboost0721_defaulthyper.txt')
print('Model 1.0, default hyperparameters, xgboost 0.72.1, python3.8, threshold 0.02, forced xgboost install:')
auc_analysis_function(train_forsure_oldmodel, test_forsure_oldmodel)
# And once more under python 3.6 with the default threshold.
train_python36 = read_capice_output('./test_output/model_2_0/result_files/train_python36_defaultthres.txt')
test_python36 = read_capice_output('./test_output/model_2_0/result_files/test_python36_defaultthres.txt')
print("Using python3.6, xgboost 0.72.1, threshold default:")
auc_analysis_function(train_python36, test_python36)
Confirmed: the 2.0 model performs just as well as model 1.0.
# Load the randomized-search models; `.best_estimator_` extracts the
# refitted best model from a RandomizedSearchCV object. All pickle.load(open())
# calls are wrapped in context managers so no file handles leak.
with open('./models/xgb_weightedSample_randomsearch_v2.pickle.dat', 'rb') as fh:
    model = pickle.load(fh)
xgbmodel = model.best_estimator_
with open('./models/xgb_weightedSample_randomsearch_dislipid.pickle.dat', 'rb') as fh:
    model_dislipid = pickle.load(fh)
xgbmodel_dislipid = model_dislipid.best_estimator_
dir(xgbmodel_dislipid._Booster)
with open('./models/xgb_ransearch_ek_dataset.pickle.dat', 'rb') as fh:
    model_ek = pickle.load(fh)
# NOTE: `model` is rebound here to the plain booster, shadowing the search
# object loaded above (same for `model_dislipid` below).
with open('./xgbmodels/xgb_booster_v2.pickle.dat', 'rb') as fh:
    model = pickle.load(fh)
model.feature_importances_
with open('./xgbmodels/xgb_booster_dyslipid.pickle.dat', 'rb') as fh:
    model_dislipid = pickle.load(fh)
model_dislipid.feature_importances_
model_ek = model_ek.best_estimator_
model_ek.feature_importances_
# Reproducibility check: compare the original training-set predictions with
# the v4 re-run; falls back to an empty frame when the original is absent
# so the notebook still runs end-to-end.
file = './datafiles/train_results.txt.gz'
if os.path.isfile(file):
    train_original = pd.read_csv(file, sep='\t', low_memory=False)
else:
    train_original = pd.DataFrame(columns=['chr', 'pos', 'ref', 'alt', 'prediction'])
train_original
train_new = read_capice_output('./datafiles/train_results_v4.txt.gz')
train_new
# Inner-join on variant identity, then count prediction mismatches.
merge = train_original[['chr', 'pos','ref','alt','prediction']].merge(train_new[['chr', 'pos','ref','alt','prediction']],
    on=['chr', 'pos','ref','alt'])
merge[merge['prediction_x'] != merge['prediction_y']]
if train_original.shape[0] > 0:
    print(f"There is a "
          f"{merge[merge['prediction_x'] != merge['prediction_y']].shape[0] / train_original.shape[0] * 100}% mismatch.")
# Same reproducibility check for the test set. The original test results
# only contain raw 'capice' scores, so a textual prediction is derived with
# the default 0.02 threshold first.
def _classify_capice_score(probability, threshold=0.02):
    """Map a raw CAPICE score to the textual class used in the v4 output."""
    # PEP8 (E731) fix: this replaces the previous lambda assignment.
    return "Pathogenic" if probability > threshold else 'Neutral'

file = './datafiles/test_results.txt'
if os.path.isfile(file):
    test_original = pd.read_csv(file, sep='\t', low_memory=False)
    test_original['prediction'] = [_classify_capice_score(probability) for probability in test_original['capice']]
    test_original.rename(columns={'#Chrom': 'chr', 'Pos':'pos', 'Ref': 'ref', 'Alt':'alt'}, inplace=True)
else:
    test_original = pd.DataFrame(columns=['chr', 'pos', 'ref', 'alt', 'prediction'])
test_original
test_new = read_capice_output('./datafiles/test_results_v4.txt.gz')
test_new
# Inner-join on variant identity, then count prediction mismatches.
merge = test_original[['chr', 'pos','ref','alt','prediction']].merge(test_new[['chr', 'pos','ref','alt','prediction']],
    on=['chr', 'pos','ref','alt'])
merge[merge['prediction_x'] != merge['prediction_y']]
if test_original.shape[0] > 0:
    print(f"There is a "
          f"{merge[merge['prediction_x'] != merge['prediction_y']].shape[0] / test_original.shape[0] * 100}% mismatch.")
Base, default hyper, model 1.0, unbalanced ds
Base, default hyper, unbalanced ds
Base, default hyper, balanced ds
Base, optimal hyper, balanced ds
# Base-model AUC analyses. `filter_out` removes variants from that file
# before test evaluation (presumably to exclude train variants from the
# test AUC — verify in utilities.full_auc_analysis).
full_auc_analysis(
    curr_setup = 'Base, Default Hyper, model 1.0, balanced ds',
    train_loc = './test_output/train_model1.txt',
    test_loc = './test_output/test_model1.txt',
    auc_analysis_name= 'auc_analysis_model1.csv',
    training_set_loc='./datafiles/train.txt.gz',
    filter_out='./datafiles/train.txt.gz'
)
# Note, Angioedema is not shown in here,
# since test.txt.gz only contains 3 pathogenic variants for this panel.
#TODO: First calculate AUC of training set, then filter out.
full_auc_analysis(
    curr_setup = 'Base, Default Hyper, model 2.0, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/base/unbalanced/base_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/base/unbalanced/base_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_bdhud.csv',
    training_set_loc = './datafiles/train.txt.gz',
    filter_out='./datafiles/train.txt.gz'
)
# Note, Angioedema is not shown in here,
# since test.txt.gz only contains 3 pathogenic variants for this panel.
full_auc_analysis(
    curr_setup = 'Base, Default Hyper, model 2.0, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/base/balanced/base_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/base/balanced/base_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_bdhbd.csv',
    training_set_loc = './test_output/model_2_0/default_hyper/base/balanced/train_balanced_dataset.tsv.gz',
    filter_out= './test_output/model_2_0/default_hyper/base/balanced/train_balanced_dataset.tsv.gz'
)
# Optimal (randomized-search) hyperparameter variants of the base model;
# these pass an explicit `model` pickle.
full_auc_analysis(curr_setup = 'Base | Optimal hyper | balanced ds',
    training_set_loc='./test_output/model_2_0/random_hyper/base/balanced/train_balanced_dataset.tsv.gz',
    train_loc='./test_output/model_2_0/random_hyper/base/balanced/base_balanced_train.txt',
    test_loc='./test_output/model_2_0/random_hyper/base/balanced/base_balanced_test.txt',
    auc_analysis_name='auc_analysis_randomhyper_base_balanced.csv',
    model='./test_output/model_2_0/random_hyper/base/balanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/random_hyper/base/balanced/train_balanced_dataset.tsv.gz'
)
full_auc_analysis(curr_setup = 'Base | Optimal hyper | unbalanced ds',
    training_set_loc='./datafiles/train.txt.gz',
    train_loc='./test_output/model_2_0/random_hyper/base/unbalanced/base_unbalanced_train.txt',
    test_loc='./test_output/model_2_0/random_hyper/base/unbalanced/base_unbalanced_test.txt',
    auc_analysis_name='auc_analysis_randomhyper_base_unbalanced.csv',
    model='./test_output/model_2_0/random_hyper/base/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./datafiles/train.txt.gz'
)
# Note, Angioedema is not shown in here,
# since test.txt.gz only contains 3 pathogenic variants for this panel.
def collect_panel_genes(panel, prefix=None):
    """Flatten a {disease: [genes]} panel dict into a unique gene list.

    Args:
        panel: mapping of disease name to a list of gene symbols.
        prefix: optional lowercase disease-name prefix filter
            (e.g. 'dyslipid' to keep only dyslipidaemia sub-panels).

    Returns:
        list of unique gene symbols, in first-seen order.
    """
    genes = []
    for disease, gene_list in panel.items():
        if prefix is not None and not disease.lower().startswith(prefix):
            continue
        for gene in gene_list:
            if gene not in genes:
                genes.append(gene)
    return genes


def export_gene_subset(dataset, genes, filename):
    """Write rows of `dataset` whose GeneName is in `genes` to a gzipped
    TSV, unless the file already exists (so re-runs are cheap)."""
    if not os.path.isfile(filename):
        dataset[dataset['GeneName'].isin(genes)].to_csv(filename, compression='gzip', index=False, sep='\t')


# The four near-identical copy-pasted loops below were consolidated into
# the two helpers above; module-level names and written files are unchanged.
hevz = collect_panel_genes(genepanels['Hart- en vaatziekten'])
hevz
training_dataset = pd.read_csv('./datafiles/train.txt.gz', sep='\t', low_memory=False)
training_dataset
for c in training_dataset.columns:
    print(c)
training_dataset[training_dataset['GeneName'].isin(hevz)]
export_gene_subset(training_dataset, hevz, './datafiles/cardiovascular.txt.gz')

dyslipid_related = collect_panel_genes(genepanels['Hart- en vaatziekten'], prefix='dyslipid')
dyslipid_related
training_dataset[training_dataset['GeneName'].isin(dyslipid_related)]
export_gene_subset(training_dataset, dyslipid_related, './datafiles/dyslipid.txt.gz')

neuro = collect_panel_genes(genepanels['Neurogenetica'])
neuro
training_dataset[training_dataset['GeneName'].isin(neuro)]
export_gene_subset(training_dataset, neuro, './datafiles/neurogenetics.txt.gz')

hc = collect_panel_genes(genepanels['Erfelijke Kanker'])
hc
training_dataset[training_dataset['GeneName'].isin(hc)]
export_gene_subset(training_dataset, hc, './datafiles/hereditarycancer.txt.gz')
# Dyslipidaemia test results scored with the corrected threshold.
test_dyslipid = read_capice_output('./test_output/test_results_dyslipid_correct_threshold.txt')
test_dyslipid
Note: 'balanced' / 'unbalanced' below refers to the dataset the model was trained on.
Cardiovascular, default hyperparameters, balanced dataset.
Cardiovascular, default hyperparameters, unbalanced dataset.
Dyslipid, default hyperparameters, balanced dataset.
Dyslipid, default hyperparameters, unbalanced dataset.
Hereditary cancer, default hyperparameters, balanced dataset.
Hereditary cancer, default hyperparameters, unbalanced dataset.
Neurogenetics, default hyperparameters, balanced dataset.
Neurogenetics, default hyperparameters, unbalanced dataset.
Cardiovascular, optimal hyperparameters, balanced dataset.
Cardiovascular, optimal hyperparameters, unbalanced dataset.
Dyslipid, optimal hyperparameters, balanced dataset.
Dyslipid, optimal hyperparameters, unbalanced dataset.
Hereditary cancer, optimal hyperparameters, balanced dataset.
Hereditary cancer, optimal hyperparameters, unbalanced dataset.
Neurogenetics, optimal hyperparameters, balanced dataset.
# Per-panel AUC analyses, default hyperparameters first, then the
# randomized-search ('Optimal Hyper') models, each in a balanced and an
# unbalanced variant. For balanced runs, `filter_out` points at the
# unbalanced train split (presumably so all train-seen variants are removed
# from the test evaluation — verify in utilities.full_auc_analysis).
full_auc_analysis(
    curr_setup = 'Cardiovascular, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/cardiovascular/balanced/cardiovascular_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/cardiovascular/balanced/cardiovascular_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_cardio_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/cardiovascular/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Cardiovascular, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_cardio_unbalanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Dyslipid, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/dyslipid/balanced/dyslipid_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/dyslipid/balanced/dyslipid_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_dyslipid_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/dyslipid/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./datafiles/dyslipid.txt.gz'
)
full_auc_analysis(
    curr_setup = 'Dyslipid, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/dyslipid/unbalanced/dyslipid_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/dyslipid/unbalanced/dyslipid_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_dyslipid_unbalanced.csv',
    training_set_loc='./datafiles/dyslipid.txt.gz',
    filter_out='./datafiles/dyslipid.txt.gz'
)
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/ek/balanced/ek_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/ek/balanced/ek_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_hc_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/ek/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/ek/unbalanced/ek_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/ek/unbalanced/ek_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_hc_unbalanced.csv',
    training_set_loc='./datafiles/hereditarycancer.txt.gz',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Neurogenetics, Default Hyper, balanced ds',
    train_loc = './test_output/model_2_0/default_hyper/neuro/balanced/neuro_balanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/neuro/balanced/neuro_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_neuro_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/neuro/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Neurogenetics, Default Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/default_hyper/neuro/unbalanced/neuro_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/default_hyper/neuro/unbalanced/neuro_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_neuro_unbalanced.csv',
    training_set_loc='./datafiles/neurogenetics.txt.gz',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
# Randomized-search (optimal hyperparameter) models; these also pass the
# explicit `model` pickle of the tuned booster.
full_auc_analysis(
    curr_setup = 'Cardiovascular, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/cardiovascular/balanced/cardiovascular_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/cardiovascular/balanced/cardiovascular_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_cardio_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/cardiovascular/balanced/train_balanced_dataset.tsv.gz',
    model='./test_output/model_2_0/random_hyper/cardiovascular/balanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Cardiovascular, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/cardiovascular/unbalanced/cardiovascular_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_cardio_unbalanced.csv',
    training_set_loc='./datafiles/cardiovascular.txt.gz',
    model='./test_output/model_2_0/random_hyper/cardiovascular/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/cardiovascular/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Dyslipid, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/dyslipid/balanced/dyslipid_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/dyslipid/balanced/dyslipid_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_dyslipid_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/dyslipid/balanced/train_balanced_dataset.tsv.gz',
    model='./test_output/model_2_0/random_hyper/dyslipid/balanced/xgb_optimal_model.pickle.dat',
    filter_out='./datafiles/dyslipid.txt.gz'
)
full_auc_analysis(
    curr_setup = 'Dyslipid, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/dyslipid/unbalanced/dyslipid_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/dyslipid/unbalanced/dyslipid_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_dyslipid_unbalanced.csv',
    training_set_loc='./datafiles/dyslipid.txt.gz',
    model='./test_output/model_2_0/random_hyper/dyslipid/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./datafiles/dyslipid.txt.gz'
)
# NOTE(review): this balanced EK run uses 'xgb_ransearch.pickle.dat' while
# every sibling uses 'xgb_optimal_model.pickle.dat' — confirm intentional.
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/ek/balanced/ek_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/ek/balanced/ek_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_ek_balanced.csv',
    model='./test_output/model_2_0/random_hyper/ek/balanced/xgb_ransearch.pickle.dat',
    training_set_loc='./test_output/model_2_0/default_hyper/ek/balanced/train_balanced_dataset.tsv.gz',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Hereditary cancer, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/ek/unbalanced/ek_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/ek/unbalanced/ek_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_ek_unbalanced.csv',
    training_set_loc='./datafiles/hereditarycancer.txt.gz',
    model='./test_output/model_2_0/random_hyper/ek/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/ek/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Neurogenetics, Optimal Hyper, balanced ds',
    train_loc = './test_output/model_2_0/random_hyper/neuro/balanced/neuro_balanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/neuro/balanced/neuro_balanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_neuro_balanced.csv',
    training_set_loc='./test_output/model_2_0/default_hyper/neuro/balanced/train_balanced_dataset.tsv.gz',
    model='./test_output/model_2_0/random_hyper/neuro/balanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
full_auc_analysis(
    curr_setup = 'Neurogenetics, Optimal Hyper, unbalanced ds',
    train_loc = './test_output/model_2_0/random_hyper/neuro/unbalanced/neuro_unbalanced_train.txt',
    test_loc = './test_output/model_2_0/random_hyper/neuro/unbalanced/neuro_unbalanced_test.txt',
    auc_analysis_name= 'auc_analysis_randomhyper_neuro_unbalanced.csv',
    training_set_loc='./datafiles/neurogenetics.txt.gz',
    model='./test_output/model_2_0/random_hyper/neuro/unbalanced/xgb_optimal_model.pickle.dat',
    filter_out='./test_output/model_2_0/default_hyper/neuro/unbalanced/splitted_train_dataset.tsv.gz'
)
# Subprocess calling of the incrementing model creations.
# For each panel x balanced/unbalanced dataset, train a model on an
# increasing holdout split (10%..90%, two extra attempts each), skipping
# combinations whose output already exists so the cell is re-runnable.
location = os.path.join(os.path.abspath('.'))
types = ['balanced', 'unbalanced']
categories = ['cardio', 'dyslipid', 'hc', 'neuro']
unbalanced_train_loc = {
    "cardio": 'cardiovascular.txt.gz',
    'dyslipid': 'dyslipid.txt.gz',
    'hc': 'hereditarycancer.txt.gz',
    'neuro': 'neurogenetics.txt.gz'
}
# Per-category JSON files with the specified default settings (-sd flag).
specified_defaults = {
    'cardio':{
        'balanced': os.path.join(location, 'not_saving_directory', 'cardiovascular_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'cardiovascular_unbalanced.json')
    },
    'dyslipid':{
        'balanced': os.path.join(location, 'not_saving_directory', 'dyslipid_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'dyslipid_unbalanced.json')
    },
    'hc':{
        'balanced': os.path.join(location, 'not_saving_directory', 'ek_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'ek_unbalanced.json')
    },
    'neuro':{
        'balanced': os.path.join(location, 'not_saving_directory', 'neuro_balanced.json'),
        'unbalanced': os.path.join(location, 'not_saving_directory', 'neuro_unbalanced.json')
    }
}
attempts = ['attempt_2', 'attempt_3']
levels = np.arange(10, 100, 10)
# np.round_ was a deprecated alias (removed in NumPy 2.0); np.round is the
# supported spelling.
level = np.round(np.arange(0.1, 1, 0.1), decimals=2)
for tipe in types:
    for category in categories:
        specified_d = specified_defaults[category][tipe]
        if tipe == 'balanced':
            input_loc = os.path.join(location, 'output_incrementing_models', tipe, category, 'train_balanced_dataset.tsv.gz')
        else:
            input_loc = os.path.join(location, 'datafiles', unbalanced_train_loc[category])
        for i in range(9):
            folder = str(levels[i])
            percentage = str(level[i])
            for attempt in attempts:
                output_loc = os.path.join(location, 'output_incrementing_models', tipe, category, folder, attempt)
                if not os.path.isfile(os.path.join(output_loc, 'splitted_test_dataset.tsv.gz')):
                    # Build the argv list directly (shell=False) instead of
                    # splitting a formatted string on spaces, which breaks
                    # if any path ever contains whitespace.
                    command = [
                        'python3',
                        '/home/rjsietsma/PycharmProjects/train_capice_model/train_model.py',
                        '-b', input_loc,
                        '-o', output_loc,
                        '-s', percentage,
                        '-v', '-d',
                        '-sd', specified_d
                    ]
                    print(f"Calling: {' '.join(command)}")
                    subprocess.call(command)
# For every trained attempt directory that has no scored output yet, run
# CAPICE's predict.sh on its held-out test split.
for path in Path('./').rglob('attempt_*'):
    if not os.path.isfile(os.path.join(path, 'test_results.txt')):
        path = os.path.abspath(path)
        input_file = os.path.join(path, 'splitted_test_dataset.tsv.gz')
        input_model = os.path.join(path, 'xgb_ransearch.pickle.dat')
        output_path = os.path.join(path, 'test_results.txt')
        # Fourth predict.sh argument is its log destination; discard it.
        dev_null = Path('/dev/null')
        command = f"bash /home/rjsietsma/PycharmProjects/capice/predict.sh {input_file} {input_model} {output_path} {dev_null}"
        print(f"Calling command: {command} \n")
        subprocess.call(command.split(' '))
def can_be_converted(entry):
    """Report whether the 'chr' column of `entry` is fully numeric.

    The previous implementation assigned the converted column back onto
    `entry`, silently mutating the caller's DataFrame as a side effect;
    the trial conversion is now discarded, leaving `entry` untouched
    (callers re-convert explicitly when this returns True).

    Args:
        entry: DataFrame with a 'chr' column.

    Returns:
        bool: True when every value converts to float64, otherwise False.
    """
    try:
        entry['chr'].astype(np.float64)
    except (ValueError, TypeError):
        return False
    return True
# Aggregate per-gene AUCs of every incrementing-model attempt into one
# summary frame per dataset type (balanced / unbalanced).
incrementing_auc_df_balanced = pd.DataFrame(columns=['panel', 'train_size', 'auc', 'stdev'])
incrementing_auc_df_unbalanced = pd.DataFrame(columns=['panel', 'train_size', 'auc', 'stdev'])
location_output = 'output_incrementing_models'
attempts = ['attempt_1', 'attempt_2', 'attempt_3']
time_bfl = time.time()
for tipe in types:
    for category in categories:
        for i in range(9):
            time_ifl = time.time()
            folder = str(levels[i])
            percentage_train = 1 - level[i]
            # Progress heartbeat: print at most once every 30 seconds.
            if time_ifl - time_bfl > 30:
                print("Still processing, currently on:")
                print(f"Type: {tipe}")
                print(f"Category: {category}")
                print(f"Folder: {folder}")
                print("\n")
                time_bfl = time.time()
            aucs = []
            sizes = []
            for j, attempt in enumerate(attempts):
                path_to_file = os.path.join(location_output, tipe, category, folder, attempt)
                test_results = read_capice_output(os.path.join(path_to_file, 'test_results.txt'))
                test_input = pd.read_csv(os.path.join(path_to_file, 'splitted_test_dataset.tsv.gz'), sep='\t', low_memory=False, usecols=['#Chrom', 'Pos', 'Ref', 'Alt', 'binarized_label'])
                train_input = pd.read_csv(os.path.join(path_to_file, 'splitted_train_dataset.tsv.gz'), sep='\t', low_memory=False)
                test_input.rename(
                    columns={'#Chrom': 'chr',
                             'Pos': 'pos',
                             'Ref': 'ref',
                             'Alt': 'alt'},
                    inplace=True
                )
                # Normalize the 'chr' merge key on both sides to plain
                # object dtype so e.g. '1', 1 and 1.0 line up in the merge.
                if can_be_converted(test_results):
                    test_results['chr'] = test_results['chr'].astype(np.float64)
                if test_results['chr'].dtype == np.float64:
                    test_results['chr'] = test_results['chr'].astype(np.int64)
                if test_results['chr'].dtype == np.int64:
                    # np.object was removed in NumPy 1.24; builtin `object`
                    # is the equivalent dtype spelling.
                    test_results['chr'] = test_results['chr'].astype(object)
                test_input['chr'] = test_input['chr'].astype(object)
                test_input['pos'] = test_input['pos'].astype(np.int64)
                merge = test_results.merge(test_input, on=['chr', 'pos', 'ref', 'alt'])
                for gene in merge['GeneName'].unique():
                    subset = merge[merge['GeneName'] == gene]
                    # BUGFIX: the per-gene AUC previously scored the FULL
                    # `merge` frame instead of `subset`, so every gene in an
                    # attempt received the identical overall AUC.
                    y_pred = np.array(subset['probabilities'])
                    y_true = np.array(subset['binarized_label'])
                    if np.unique(y_true).size > 1:
                        aucs.append(roc_auc_score(y_true=y_true, y_score=y_pred))
                        sizes.append(train_input.shape[0])
                    else:
                        print(f"Category: {category}, folder: {folder}, attempt: {attempt}, gene {gene} does not have enough datapoints for AUC calculations.")
                        continue
            aucs = np.array(aucs)
            sizes = np.array(sizes)
            add_df = pd.DataFrame({
                'panel': category,
                'train_size': sizes.mean(),
                'auc': aucs.mean(),
                'stdev': aucs.std()
            }, index=[0])
            # DataFrame.append() was removed in pandas 2.0; pd.concat with
            # ignore_index matches the old append(..., ignore_index=True).
            if tipe == 'balanced':
                incrementing_auc_df_balanced = pd.concat([incrementing_auc_df_balanced, add_df], ignore_index=True)
            else:
                incrementing_auc_df_unbalanced = pd.concat([incrementing_auc_df_unbalanced, add_df], ignore_index=True)
def func(x, a, b, c):
    """Logarithmic model ``a * log(b * x) + c`` fitted to AUC-vs-size data."""
    return np.log(b * x) * a + c
# Inspect the balanced aggregation result.
incrementing_auc_df_balanced
# Hard-coded reference AUC per panel of the base model (presumably taken
# from the full_auc_analysis output above — verify against the
# auc_analysis CSVs). No reference value exists here for 'dyslipid'.
basepanel_performance = {
    'cardio': 0.746953,
    'hc': 0.854261,
    'neuro': 0.869192
}
# Plot per-panel AUC vs. training-set size for the balanced models: mean
# AUC with stdev error bars, a fitted logarithmic curve per panel, the
# base-model AUC as a dashed reference line, and the point where the fit
# crosses that reference.
from bokeh.palettes import Colorblind
import random
categories = incrementing_auc_df_balanced['panel'].unique()
# pallette = magma(categories.size)
# pallette = ('#000000', '#FF00F0', '#00FFFF', '#DC5039')
blind = list(Colorblind[4])
print(blind)
pallette = blind[:]
# NOTE(review): shuffle is unseeded, so panel colors differ between runs.
random.shuffle(pallette)
print(pallette)
colormap = dict(zip(categories, pallette))
incrementing_auc_df_balanced['color'] = incrementing_auc_df_balanced['panel'].map(colormap)
# Roughly one x sample per unit of training-set size for a smooth curve.
x_axis = np.linspace(0, incrementing_auc_df_balanced['train_size'].max(), int(incrementing_auc_df_balanced['train_size'].max()), endpoint=True)
plt.figure(figsize=(9,7))
for panel in incrementing_auc_df_balanced['panel'].unique():
    subset = incrementing_auc_df_balanced[incrementing_auc_df_balanced['panel'] == panel]
    color = subset['color'].unique()[0]
    plt.errorbar(
        x=subset['train_size'],
        y=subset['auc'],
        yerr=subset['stdev'],
        c=color,
        fmt='o',
        capsize=0,
        elinewidth=3,
        ls=None
    )
    # Fit a*log(b*x) + c through the measured points.
    x = np.array(subset['train_size'])
    y = np.array(subset['auc'])
    popt, pcov = curve_fit(func, x, y)
    popt = np.round(popt, decimals=4)
    formula = f"{popt[0]}*log({popt[1]} * x) + {popt[2]}"
    y_exp = func(x_axis, *popt)
    plt.plot(x_axis, y_exp, c=color, label=f"{panel} (y={formula})")
    if panel in basepanel_performance.keys():
        auc = basepanel_performance[panel]
        # Nearest sampled x to where the fitted curve crosses the base AUC.
        intercept_df = pd.DataFrame(
            {
                "x_value": x_axis,
                "y_exp": y_exp
            })
        intercept = math.floor(intercept_df.iloc[(intercept_df['y_exp'] - auc).abs().argsort()[:1]]['x_value'].values[0])
        plt.plot(x_axis, np.full(x_axis.shape, auc), '--', c=color,
                 label=f"Base of {panel} (AUC={auc})")
        plt.scatter(intercept, auc, s=200, marker='*', c=color,
                    label=f"Intercept of {panel}: x={intercept}, y={auc}")
plt.legend(loc='upper right', bbox_to_anchor=(1.6,1.025))
plt.xlabel('Train dataset size')
plt.ylabel('AUC')
plt.ylim((0.5,1.01))
plt.title('Incrementing AUC analysis of balanced models.')
plt.show()
# Same plot for the unbalanced models; the base-model reference AUCs are
# re-bound here (presumably the unbalanced base results — verify against
# the auc_analysis CSVs). Again no 'dyslipid' reference.
basepanel_performance = {
    'cardio': 0.659012,
    'hc': 0.638193,
    'neuro': 0.767613
}
categories = incrementing_auc_df_unbalanced['panel'].unique()
pallette = viridis(categories.size)
colormap = dict(zip(categories, pallette))
incrementing_auc_df_unbalanced['color'] = incrementing_auc_df_unbalanced['panel'].map(colormap)
x_axis = np.linspace(0, incrementing_auc_df_unbalanced['train_size'].max(), int(incrementing_auc_df_unbalanced['train_size'].max()), endpoint=True)
plt.figure(figsize=(9,7))
for panel in incrementing_auc_df_unbalanced['panel'].unique():
    subset = incrementing_auc_df_unbalanced[incrementing_auc_df_unbalanced['panel'] == panel]
    color = subset['color'].unique()[0]
    plt.errorbar(
        x=subset['train_size'],
        y=subset['auc'],
        yerr=subset['stdev'],
        c=color,
        fmt='o',
        capsize=0,
        elinewidth=3,
        ls=None
    )
    # Fit a*log(b*x) + c through the measured points.
    x = np.array(subset['train_size'])
    y = np.array(subset['auc'])
    popt, pcov = curve_fit(func, x, y)
    popt = np.round(popt, decimals=4)
    formula = f"{popt[0]}*log({popt[1]} * x) + {popt[2]}"
    y_exp = func(x_axis, *popt)
    plt.plot(x_axis, y_exp, c=color, label=f"{panel} (y={formula})")
    if panel in basepanel_performance.keys():
        auc = basepanel_performance[panel]
        # Nearest sampled x to where the fitted curve crosses the base AUC.
        intercept_df = pd.DataFrame(
            {
                "x_value": x_axis,
                "y_exp": y_exp
            })
        intercept = math.floor(intercept_df.iloc[(intercept_df['y_exp'] - auc).abs().argsort()[:1]]['x_value'].values[0])
        plt.plot(x_axis, np.full(x_axis.shape, auc), '--', c=color,
                 label=f"Base of {panel} (AUC={auc})")
        plt.scatter(intercept, auc, s=200, marker='*', c=color,
                    label=f"Intercept of {panel}: x={intercept}, y={auc}")
plt.legend(loc='upper right', bbox_to_anchor=(1.6,1.025))
plt.xlabel('Train dataset size')
plt.ylabel('AUC')
plt.title('Incrementing AUC analysis of unbalanced models.')
plt.show()
# Distribution of variant consequences across the whole training dataset.
variants = pd.read_csv('./datafiles/train.txt.gz', compression='gzip', sep='\t', low_memory=False)
variants
ax = variants['Consequence'].value_counts().plot(kind='bar')
ax.set_title('Type of SNV over the entire training dataset.')
ax.set_ylabel('Count')
ax.set_xlabel('Consequence')
# Reduce the information to just panels: {panel: [unique genes]}, then plot
# the consequence distribution of the training variants per panel.
panels = {}
for panel in genepanels.keys():
    panels[panel] = []
    disease_panel = genepanels[panel]
    if isinstance(disease_panel, dict):
        for disease, genes in disease_panel.items():
            for gene in genes:
                if gene not in panels[panel]:
                    panels[panel].append(gene)
for panel, genes in panels.items():
    title = f'Type of SNV over the entire training dataset, panel: {panel}'
    subset = variants[variants['GeneName'].isin(genes)]
    data = subset['Consequence'].value_counts()
    ax = data.plot.bar(x='index', y='Consequence',
                       colormap='viridis',
                       title=title)
    ax.set_ylabel('Count')
    # BUGFIX: the x-axis shows consequences, not panels.
    ax.set_xlabel('Consequence')
    # Removed the ineffective `fig = plt.Figure(); fig.axes.append(ax)`:
    # appending to Figure.axes is not a supported way to attach axes and
    # had no effect on the rendered output.
    plt.show()